## library(devtools)
## library(BiocManager)
library(SingleCellExperiment)
library(ggplot2)
library(scFeatures) ## devtools::install_github("SydneyBioX/scFeatures")
library(ClassifyR) ## BiocManager::install("ClassifyR", dependencies = TRUE)
library(lisaClust)
library(ggthemes)
library(spicyR) ## BiocManager::install("spicyR")
library(dplyr)
library(limma)
library(plotly)
library(scattermore)
library(tidyr)
library(survival)
library(survminer)
library(spatstat)
library(scater)
library(scran)
library(reshape)
theme_set(theme_classic())
# code for plotting purpose
plot_boxplot <- function( feature ){
data <- t(feature)
patient <- unlist( lapply( strsplit( colnames(data), "_cond_"), `[`, 1) )
condition <- unlist( lapply( strsplit( colnames(data), "_cond_"), `[`, 2))
condition <- data.frame(sample = patient, condition = condition )
design <- model.matrix(~condition, data = condition)
fit <- lmFit(data, design)
fit <- eBayes(fit)
tT <- topTable(fit, n = Inf)
# selecting the top 10 DE features
top_gene <- rownames( tT)[1:10 ]
rownames( condition) <- colnames(data)
data_plot <- data[top_gene, ]
data_plot <- melt(data_plot )
colnames(data_plot) <- c("X1", "X2", "value")
data_plot$condition <- unlist( lapply( strsplit( as.character( data_plot$X2), "_cond_"), `[`, 2))
p <- ggplot(data_plot, aes( x = X1, y = value , colour = condition)) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
return(p)
}
plot_barplot <- function(data , dodge=F ){
data$patient <- unlist( lapply( strsplit( rownames(data ), "_cond_"), `[`, 1))
data$condition <- unlist( lapply( strsplit( rownames(data ), "_cond_"), `[`, 2))
data <- as.data.frame( melt(data, id=c("patient", "condition")) )
p <- ggplot(data , aes( x = patient , y = value , fill = variable) ) +
geom_bar(stat="identity" ) + facet_wrap(~condition, scale="free") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
return (p)
}
As single cell technology advances, the recent development of spatial omics allows us to examine the spatial organisation of cells within tissues in their native environment. This workshop will discuss the challenges and analytical focus associated with using multi-sample spatial datasets for disease risk prognosis. We will also talk about general analytical strategies and the critical thinking questions that arise in the workflow.
sessionInfo at the end of this document to ensure you are
using the correct version. Familiarity with our previous workshop
vignette on Introduction to Single
Cell RNA-seq AnalysisClassifyRNote: This data analysis workshop offers
participants the opportunity to engage in hands-on analysis using R.
However, if you are not comfortable with coding in R, you can still
acquire valuable interpretation skills by reviewing the output we
provide in this file.
Structure of the 3-hour workshop:
| Activity | Time |
|---|---|
| Introduction | 15m |
| Exploring spatial data | 35m |
| Feature engineering with scFeatures | 50m |
| Break | 15m |
| Survival analysis with ClassifyR | 45m |
| Identify cohort heterogeneity | 20m |
| Concluding remarks |
The widely-known METABRIC breast cancer cohort has recently had imaging mass cytometry (cell-level resolution) generated for a subset of it. The publication describing this data is Imaging Mass Cytometry and Multiplatform Genomics Define the Phenogenomic Landscape of Breast Cancer, Nature Cancer, 2020. There are 483 cancer samples with IMC data. However, the subset of interest is those patients who do not have lymph node metastasis. Can their risk of recurrence accurately be estimated and therefore inform how aggressively they need to be treated? The other component of the analysis is patient clinical data, which has been sourced from Supplementary Table 5 of Dynamics of Breast-cancer Relapse Reveal Late-recurring ER-positive Genomic Subgroups, Nature, 2019.
The original data has been restricted to images with at least 400 cells, no lymph node cancer and Stage 1.
load("~/data/breastCancer.RData")
data_sce <- IMC
## Quick look at data
data_sce
## class: SingleCellExperiment
## dim: 38 76307
## metadata(0):
## assays(2): counts logcounts
## rownames(38): HH3_total CK19 ... H3K27me3 CK5
## rowData names(0):
## colnames(76307): MB-0002:345:93 MB-0002:345:107 ... MB-0663:394:577
## MB-0663:394:578
## colData names(17): file_id metabricId ... x_cord y_cord
## reducedDimNames(1): UMAP
## mainExpName: NULL
## altExpNames(0):
logcounts(data_sce)[1:5, 1:5]
## MB-0002:345:93 MB-0002:345:107 MB-0002:345:113 MB-0002:345:114
## HH3_total 2.5410994 2.2699132 1.83124965 2.54153682
## CK19 1.1452546 1.2155873 0.18690945 0.41539814
## CK8_18 1.9270420 1.7957914 0.68297921 1.77968459
## Twist 0.1972571 0.1604647 0.13589493 0.27251430
## CD68 0.1092099 0.1396621 0.04580369 0.07365269
## MB-0002:345:125
## HH3_total 1.77142549
## CK19 0.09835966
## CK8_18 0.27171782
## Twist 0.16355599
## CD68 0.00000000
Basic characteristics of the data objects:
## The object stores meta data (such as patient outcome information) about each cell
## the metadata is stored in colData()
# we need to convert to a data.frame to be input into datatable function
DT::datatable( data.frame(colData(data_sce))[1:5, ], options = list(scrollX = TRUE))
At the start of any analysis pipeline, it is often good to explore the data to get a sense of the complexity. We usually do this by exploring the distribution of the outcomes and various variables in the patients’ meta-data. Here, we use cross-tabulation to examine the following variables:
print("Stage and death")
## [1] "Stage and death"
table(clinical$Breast.Surgery, clinical$Death, useNA = "ifany")
##
## 0 1
## BREAST CONSERVING 38 14
## MASTECTOMY 16 6
## <NA> 2 1
print("Number of patients based on ER status")
## [1] "Number of patients based on ER status"
table(clinical$ER.Status, useNA = "ifany")
##
## neg pos <NA>
## 12 64 1
print("Number of patients based on Grade")
## [1] "Number of patients based on Grade"
table(clinical$Grade , useNA = "ifany")
##
## 1 2 3 <NA>
## 14 30 28 5
Typically in single-cell data analysis, we perform dimension reduction to project the high dimensional cell x gene matrix on to 2D space. This allows us to visualise various things of interest, such as distribution of cell types and disease outcomes. In this dataset, cells were classified into 22 cell types based on their markers.
data_sce <- runUMAP(data_sce, scale=TRUE)
# With the UMAP function we can highlight by meta data of interest
# Here we highlight the cell types and sample ID
a <- plotUMAP(data_sce, colour_by = "description")
b <- plotUMAP(data_sce, colour_by = "metabricId")
ggarrange( plotlist = list(a,b))
Interactive Q&A:
What can we learn from these illustrations? Is there anything interesting in the plot? Questions to consider include:
Visualise selected patient:
Here we selectively visualise two patients by highlighting only the
selected patients.
# extract the meta data and the UMAP dimension of the dataset
metadata <- colData(data_sce)
metadata <- cbind(metadata, reducedDim(data_sce, "UMAP"))
metadata <- data.frame(metadata)
# for the selected patient, denote it as "selected patient" and all other patients as "other patients"
metadata$selected_patient <- ifelse( metadata$metabricId == "MB-0475", "seleted patient" , "other patients")
a <- ggplot(metadata, aes(x =UMAP1 , y = UMAP2 , colour = selected_patient )) + geom_scattermore(pointsize = 0.5) + scale_colour_manual(values = c("grey" , "red"))
# repeat for another patient
metadata$selected_patient <- ifelse( metadata$metabricId == "MB-0628", "seleted patient" , "other patients")
b <- ggplot(metadata, aes(x =UMAP1 , y = UMAP2 , colour = selected_patient )) + geom_scattermore(pointsize = 0.5) + scale_colour_manual(values = c("grey" , "red"))
ggarrange(plotlist = list(a,b ))
The advantage with spatial omics is that we can examine the organisation of the cell types as it occurs on the tissue slide. Here, we visualise one of the slides from a patient. As an optional exercise, you can:
to give a sense of what is random ordering.
Here we choose a particular patient “MB-0263” and visualise its spatial pattern.
# obtaining the meta data for this patient
one_sample <- data_sce[, data_sce$metabricId == "MB-0263"]
one_sample <- colData(one_sample)
one_sample <- data.frame(one_sample)
a <- ggplot(one_sample, aes(x = Location_Center_X , y = Location_Center_Y, colour = description)) + geom_point(alpha=0.7) + scale_colour_tableau() + ggtitle("Original slide")
The code here investigates permutations of spatially resolved data. Please examine the next tab for actual results.
# "Optional: Permute the cell type labels"
one_sample$description_permute <- sample(one_sample$description)
b <- ggplot(one_sample, aes(x = Location_Center_X , y = Location_Center_Y, colour =description_permute)) + geom_point(alpha=0.7) + scale_colour_tableau() + ggtitle("Permute the cell type label")
# "Optional: Permute the spatial coordinate"
one_sample$Location_Center_X_permute <- sample(one_sample$Location_Center_X)
one_sample$Location_Center_Y_permute <- sample(one_sample$Location_Center_Y)
c <- ggplot(one_sample, aes(x = Location_Center_X_permute , y = Location_Center_Y_permute, colour = description)) + geom_point(alpha=0.7) + scale_colour_tableau() + ggtitle("Permute the X, Y coordinate")
The aim here is to have an understanding of the concept of spatial randomness. Spatial statistics is a topic of study that encompasses a wide range of research. The next two code chucks focus on the examination of two distinct permutation strategies. The objective of this investigation is to gain an understanding of how various permutation strategies might yield varied perceptions of randomness.
## ggarrange provides a way to arrange multiple ggplots for efficient visual comparison
ggarrange( plotlist = list(a, b, c), ncol = 3)
Interactive Q&A:
Q3: Is there a structure in the data or is the cell type randomly distribution?
Spatial data allow for the identification of a variety of characteristics including distinct cell types within an image, providing an overview of the tissue environment. This allows scientists to explore the cellular architecture and environment and its association with phenotype information (e.g meta-data). For our data story, we are interested in whether the patients have a good or poor outcome. The outcome is often called a ‘prognosis’ and a good outcome is sometimes called ‘favourable’. In this section, we examine graphically how cell-type co-localisation varies across spatial regions and how is such information associated with individual survival outcome.
We begin by examining how can we identify and visualise regions of tissue where spatial associations between cell-types are similar. There are many packages that perform this task and here we use the lisaClust function that is based on “local L-function” to spatially cluster cells into different regions with similar cell type composition.
This has already been pre-loaded for you. Please proceed to the next task.
set.seed(51773)
BPPARAM <- MulticoreParam(16)
# Cluster cells into spatial regions with similar composition.
data_sce <- lisaClust(
data_sce ,
k = 5,
Rs = c(20, 50, 100),
sigma = 50,
spatialCoords = c("Location_Center_X", "Location_Center_Y"),
cellType = "description",
imageID = "ImageNumber" ,
regionName = "region",
BPPARAM = BPPARAM
)
As a case study, we first compare individuals with good or poor prognosis. We define:
We examine different ways of visualising data.
Here we visualise the region result based on one individual.
# get meta data for a selected patient to visualise
metadata <- colData(data_sce)
metadata <- metadata[metadata$metabricId == "MB-0628", ]
metadata <- data.frame(metadata)
plotlist <- list() # define the list to store images for region highlighting
plotlist_celltype <- list() # define the list to store images for celltype highlighting
# optional: define a colour palette
# you can also specify your own colour to use in the variable color_codes
tableau_palette <- scale_colour_tableau( palette = "Tableau 20")
color_codes <- tableau_palette$palette(18)
color_codes <- c(color_codes, "darkgrey") # for all other regions apart from region of interest, make the colour grey
names(color_codes) <- c(unique(metadata$description) , "other regions")
# look through each region to highlight the region of interest, as well as the cell type in the region of interest
for ( thisregion in sort(unique(metadata$region))){
# select the region of interest
selected_region_index <- metadata$region == thisregion
# for all other regions, define them as "other regions" so that they will be greyed out
metadata$selected_region <- "other regions"
metadata$selected_region[selected_region_index] <- "selected region"
# for all cell types outside the region of the interest, also make them greyed out
metadata$celltype <- metadata$description
metadata$celltype[!selected_region_index ] <- "other regions"
metadata$celltype <- factor(metadata$celltype, levels = c(unique(metadata$description), "other regions"))
# plot ggplot highlighting the region of interest
p <- ggplot(metadata, aes(x = Location_Center_X , y = Location_Center_Y , colour = selected_region )) + geom_point(alpha = 0.7) + ggtitle(thisregion) + scale_colour_manual(values = c("grey" , "red"))
# plot ggplot highlighting the celltypes in the region of interest
p2 <- ggplot(metadata, aes(x = Location_Center_X , y = Location_Center_Y , colour = celltype )) + geom_point(alpha = 0.7 ) + ggtitle(thisregion) + scale_colour_manual(values = color_codes)
plotlist [[thisregion ]] <- p
plotlist_celltype [[thisregion ]] <- p2
}
ggarrange(plotlist = plotlist , ncol = 5, nrow = 1 , common.legend = T )
ggarrange(plotlist = plotlist_celltype , ncol = 5, nrow = 1 , common.legend = T )
We can better interpret the region output by summarising the proportion of each cell type in a region across the patients. Looking at the composition of cell types in each region, comparing between prognostic outcome.
df <- data.frame(colData( data_sce))
clinical$survivalgroup <- clinical$timeRFS
clinical$survivalgroup[which( clinical$timeRFS < 5* 365) ] <- "less than 5 years"
clinical$survivalgroup[which( clinical$timeRFS > 10* 365) ] <- "more than 10 years"
# for each patient, calculate the proportion of each cell type in each region.
df_plot <- NULL
for ( thispatient in unique(df$metabricId)){
this_df <- df[df$metabricId == thispatient, ]
temp_df <- table( this_df$region, this_df$description )
temp_df <- temp_df / rowSums(temp_df)
temp_df <- data.frame( temp_df)
temp_df$patient <- thispatient
temp_df$group <- clinical[clinical$metabricId == thispatient, "survivalgroup"]
df_plot <- rbind(df_plot, temp_df)
}
# for each region, calculate the average proportion of each cell type across all patients
df_plot <- df_plot %>% dplyr::group_by( Var1 , Var2, group) %>%
summarise(mean_proportion = mean(Freq))
# we are only interested in the short term and long term survival patients
df_plot <- df_plot [ df_plot$group %in% c("less than 5 years", "more than 10 years"), ]
ggplot(df_plot, aes(y = Var2, x = Var1 ,colour =mean_proportion , size = mean_proportion ))+ geom_point() +
facet_grid(~group, scales = "free", space = "free" ) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
xlab("Region" ) + ylab("Celltype") + scale_colour_viridis_c()
Interactive Q&A:
Q4: Which regions appear to be different between poor prognosis (short-term survival) and good prognosis (long-term survival) patients?
Let’s focus on region 5 and visualise the data with boxplots.
df <- clinical[colData(data_sce)[, "metabricId"], ]
df$region <- data_sce$region
df$description <- data_sce$description
df_plot <- NULL
for ( thispatient in unique(df$metabricId)){
this_df <- df[df$metabricId == thispatient, ]
temp_df <- table( this_df$region, this_df$description )
temp_df <- temp_df / rowSums(temp_df)
temp_df <- data.frame( temp_df)
temp_df$patient <- thispatient
temp_df$group <- unique( this_df$survivalgroup)
df_plot <- rbind(df_plot, temp_df)
}
df_plot_region <- df_plot[df_plot$Var1 == "region_5", ]
df_plot_region <- df_plot_region [ df_plot_region$group %in% c("less than 5 years", "more than 10 years"), ]
ggplot(df_plot_region, aes(x = Var2, y = Freq, colour = group)) +
geom_boxplot()+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ylab("Proportion") + xlab("Cell type") + ggtitle("Selected region") + ylim(0, 0.25)
In this demo, we use scFeatures to generate a molecular representation for each patient from the matrix of proteins x cells. The molecular representation is interpretable and hence facilitates downstream analysis of the patient. Overall, scFeatures generates features across six categories representing different molecular views of cellular characteristics. These include:
The different types of features constructed enable a more comprehensive multi-view understanding of each patient from a matrix of proteins x cells.
Given in the previous section, we clustered the cells into regions, we can use the region information as an additional layer of information on top of the cell types to examine region-specific cell-type specific features.
region <- data_sce$region
region <- gsub("_" , "", region)
# concat the region information to the cell type
data_sce$celltype <- paste0( data_sce$description , "-" , region)
print("number of cells in each sample")
## [1] "number of cells in each sample"
DT::datatable( data.frame(table( data_sce$metabricId )) , options = list(scrollX = TRUE))
print("number of cells in each celltype - Region specific cell type")
## [1] "number of cells in each celltype - Region specific cell type"
DT::datatable( data.frame(table( data_sce$celltype)) , options = list(scrollX = TRUE))
Discussion:
Q5: Are there any samples or cell types you would like to remove from the data?
There are different ways you can use scFeatures to generate molecular representation for patients.
scFeatures can also generate selected feature types. For example, in the below code we run scFeatures to generate cell type proportion.
# scFeatures requires the following information
# data, sample, X, Y coordinates
IMCmatrix <- assay(data_sce)
celltype <- data_sce$celltype
spatialCoords <- list(colData(data_sce)$Location_Center_X, colData(data_sce)$Location_Center_Y)
sample <- data_sce$metabricId
cond <- clinical[ match(sample, clinical$metabricId), ]$survivalgroup
sample <- paste0(sample, "_cond_", cond ) # append the condition to the patient so we can easily retrieve the patient condition
scfeatures_result <- scFeatures( IMCmatrix,
sample = sample, celltype = celltype, spatialCoords = spatialCoords,
feature_types = "proportion_raw", type = "spatial_p" )
feature <- scfeatures_result$proportion_raw
feature <- feature[ grep("less than|more than", rownames(feature )), ]
plot_barplot( feature ) + ggtitle("Proportion raw feature")
Here we select the HR- CK7+ cell type.
index <- grep("HR- CK7+-region" , celltype, fixed=T)
selected_celltype <- celltype[ index]
selected_data <- IMCmatrix[, index]
selected_spatialCoords <- list(colData(data_sce)$Location_Center_X[index],
colData(data_sce)$Location_Center_Y[index])
selected_sample <- sample[index]
scfeatures_result <- scFeatures( selected_data,
sample = selected_sample, celltype = selected_celltype,
spatialCoords = selected_spatialCoords,
feature_types = "proportion_raw", type = "spatial_p" )
feature <- scfeatures_result$proportion_raw
feature <- feature[ grep("less than|more than", rownames(feature )), ]
plot_barplot( feature ) + ggtitle("Proportion raw feature")
The code below generates all feature types for all cell types, to save time we won’t run it in today’s workshop, please just load the prepared RDS file in the following chunk.
# here, we specify that this is a spatial proteomics data
# scFeatures support parallel computation to speed up the process
scfeatures_result <- scFeatures(IMCmatrix, type = "spatial_p",
sample = sample, celltype = celltype, spatialCoords = spatialCoords,
ncores = 32)
scfeatures_result <- readRDS("~/data/scfeatures_result.RDS")
From the section above, we have generated a total of 13 feature types
and stored them in a list. All generated feature types are stored in a
matrix of samples x features. For example, the first list element
contains the feature type proportion_raw, which contains
the cell type proportion features for each patient sample. We could
print out the first 5 columns and first 5 rows of the first element to
see.
type(scfeatures_result)
## [1] "list"
# we have generated a total of 13 feature types
names(scfeatures_result)
## [1] "proportion_raw" "proportion_logit" "proportion_ratio"
## [4] "gene_mean_celltype" "gene_prop_celltype" "gene_cor_celltype"
## [7] "gene_mean_bulk" "gene_prop_bulk" "gene_cor_bulk"
## [10] "L_stats" "celltype_interaction" "morans_I"
## [13] "nn_correlation"
lapply(scfeatures_result, dim)
## $proportion_raw
## [1] 77 110
##
## $proportion_logit
## [1] 77 110
##
## $proportion_ratio
## [1] 77 5995
##
## $gene_mean_celltype
## [1] 77 4180
##
## $gene_prop_celltype
## [1] 77 4180
##
## $gene_cor_celltype
## [1] 77 27958
##
## $gene_mean_bulk
## [1] 77 38
##
## $gene_prop_bulk
## [1] 77 38
##
## $gene_cor_bulk
## [1] 77 703
##
## $L_stats
## [1] 77 3842
##
## $celltype_interaction
## [1] 77 4393
##
## $morans_I
## [1] 77 38
##
## $nn_correlation
## [1] 77 38
# each row is a sample, each column is a feature
data.frame(scfeatures_result[[1]][1:5, 1:5])
## B.cells.region1 B.cells.region2 B.cells.region3
## MB-0002_cond_2539 0.0000000000 0.000000000 0
## MB-0064_cond_3268 0.0000000000 0.000000000 0
## MB-0128_cond_more than 10 years 0.0009950249 0.044776119 0
## MB-0130_cond_more than 10 years 0.0000000000 0.000000000 0
## MB-0132_cond_2752 0.0000000000 0.007174888 0
## B.cells.region4 B.cells.region5
## MB-0002_cond_2539 0.00000000 0.000000000
## MB-0064_cond_3268 0.00000000 0.000000000
## MB-0128_cond_more than 10 years 0.06666667 0.007960199
## MB-0130_cond_more than 10 years 0.00000000 0.000000000
## MB-0132_cond_2752 0.00000000 0.000000000
The next question is, how can we visually explore all the features.
Once the features are generated, you may wish to visually explore the features.
Here we plot a volcano plot and a dotplot for the cell type specific expression feature.
gene_mean_celltype <- scfeatures_result$gene_mean_celltype
# select a certian cell type in a certain region
gene_mean_celltype <- gene_mean_celltype [, grep( "HR+ CK7--region5", colnames(gene_mean_celltype) , fixed= T) ]
gene_mean_celltype <- t(gene_mean_celltype)
condition <- unlist( lapply( strsplit( colnames(gene_mean_celltype) , "_cond_"), `[`, 2))
condition <- data.frame(sample = colnames(gene_mean_celltype), condition = condition )
# again , we are only interested in the short term survival and long term survival patients
select_index <- which( condition$condition %in% c("less than 5 years", "more than 10 years" ))
condition <- condition[ select_index, ]
gene_mean_celltype<- gene_mean_celltype [ , select_index]
# standard code to calculate log fold change each protein
design <- model.matrix(~condition, data = condition)
fit <- lmFit(gene_mean_celltype, design)
fit <- eBayes(fit)
tT <- topTable(fit, n = Inf)
tT$gene <- rownames(tT)
p <- ggplot( tT , aes(logFC,-log10(P.Value) , text = gene ) )+
geom_point(aes(colour=-log10(P.Value)), alpha=1/3, size=1) +
scale_colour_gradient(low="blue",high="red")+
xlab("log2 fold change") + ylab("-log10 p-value")
p
# order the proteins by log fold change
tT <- tT[ order(tT$logFC, decreasing = T), ]
tT <- tT[1:20, ]
ggplot( tT , aes( y = reorder(gene, logFC) , x = logFC ) )+
geom_point(aes(colour=-log10(P.Value)), alpha=1/3, size=4) +
scale_colour_gradient(low="blue",high="red")+
xlab("logFC") + ylab("region specific cell type specfic features" )
Interactive Q&A:
Q6: Which figure do you prefer? The volcano plot or the dotplot?
We can also visualise the distribution of the features across the patients. This helps to examine whether there exists any batch effect due to patient effect.
feature <- scfeatures_result$proportion_raw
feature <- feature[ grep("less than|more than", rownames(feature )), ]
plot_barplot(feature ) + ggtitle("Proportion raw feature")
feature <- scfeatures_result$gene_mean_celltype
feature <- feature[ grep("less than|more than", rownames(feature )), ]
feature <- feature[ , grep( "HR+ CK7--region5" , colnames(feature) , fixed=T) ] # select the features from a particular cell type and a particular region
plot_boxplot(feature) + ggtitle("Gene mean celltype feature")
feature <- scfeatures_result$gene_mean_bulk
feature <- feature[ grep("less than|more than", rownames(feature )), ]
plot_boxplot(feature) + ggtitle("Gene mean bulk feature")
feature <- scfeatures_result$nn_correlation
feature <- feature[ grep("less than|more than", rownames(feature )), ]
plot_boxplot(feature) + ggtitle("Nearest neighbour correlation feature")
To accommodate for easier interpretation of the features, scFeatures
contains a function run_association_study_report that
enables the user to readily visualise and explore all generated features
with one line of code.
# specify a folder to store the html report. Here we store it in the current working directory.
output_folder <- getwd()
scfeatures_result_new <- scfeatures_result
for (i in 1:13){
thisfeature <- scfeatures_result[[i]]
thisfeature <- thisfeature[ grep("less than|more than", rownames(thisfeature )), ]
scfeatures_result_new[[i]] <- thisfeature
}
run_association_study_report(scfeatures_result_new, output_folder )
Recurrence risk estimation is a fundamental concern in medical research, particularly in the context of patient survival analysis. In this section, we will estimate recurrence risk using the molecular representation of patients to build a survival model. We will use classifyR to build the survival model. The patient outcome is time-to-event, so, by default, ClassifyR will use Cox proportional hazards ranking to choose a set of features and also Cox proportional hazards to predict risk scores. We will also demonstrate other available models in ClassifyR.
Recall in the previous section that we have stored the 13 matrices of different feature types in a list. Instead of manually retrieving each matrix from the list to build separate models, classifyR can directly take a list of matrices as an input and run a repeated cross-validation model on each matrix individually. Below, we run 5 repeats of 5-fold cross-validation. A high score indicates prognosis of a worse outcome than a lower risk score. To save time, just load the prepared RDS file.
# We use the following variables:
# timeRFS: "Time to Recurrence-Free Survival." It is the time period until recurrence occurs.
# eventRFS: "Event in Recurrence-Free Survival."It indicates whether the event has occurred.
# Breast.Tumour.Laterality: Laterality of tumors, eg, whether the tumor is located in left or right.
# ER.Status: Whether the tumor is ER positive or ER negative.
# Inferred.Menopausal.State: of the patient.
# Grade: of the tumor.
# Size: of the tumor.
usefulFeatures <- c("Breast.Tumour.Laterality", "ER.Status", "Inferred.Menopausal.State", "Grade", "Size")
nFeatures <- append(list(clinical = 1:3), lapply(scfeatures_result, function(metaFeature) 1:5)) # nFeatures provides a range of values of top-ranked features to try. It limits the model to be smaller. Use 3 for clinical features and 5 for scFeatures
clinicalAndOmics <- append(list(clinical = clinical), scfeatures_result)
### generate classfyr result
classifyr_result <- crossValidate(clinicalAndOmics, c("timeRFS", "eventRFS"),
extraParams = list(prepare = list(useFeatures = list(clinical = usefulFeatures))),
nFeatures = nFeatures, nFolds = 5, nRepeats = 5, nCores = 5)
classifyr_result <- readRDS("~/data/classifyr_result_IMC.rds")
Cox proportional hazards is a classical statistical method, as opposed to machine learning methods like Random survival forest. These machine learning methods can build remarkably complex relationships between features, however their running time can be much longer than Cox proportional hazards. We use feature selection to limit the number of features considered to at most 100 per metafeature and to save time, you can just load the prepared RDS file. We will compare the predictive performance between these methods.
nFeatures <- append(list(clinical = 1:3), lapply(scfeatures_result[2:length(scfeatures_result)], function(metaFeature) min(100, ncol(metaFeature))))
survForestCV <- crossValidate(clinicalAndOmics, outcome, nFeatures = nFeatures,
classifier = "randomForest",
nFolds = 5, nRepeats = 5, nCores = 5)
survForestCV <- readRDS("~/data/survForestCV.RDS")
To examine the distribution of prognostic performance, use
performancePlot. If it is categorical data, the
automatically chosen performance metric is balanced accuracy and if it
is survival data, then it will automatically choose C-index.
tilt <- theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
performancePlot(append(classifyr_result, survForestCV),
characteristicsList = list(x = "Assay Name", row = "Classifier Name"),
orderingList = list("Assay Name" = c("clinical", names(scfeatures_result)))) + tilt
Note how the resultant plot is a ggplot2 object and can
be further modified. The same code could be used for a categorical
classifier because the random forest implementation provided by the
ranger package has the same interface for both.
Next, examine feature selection stability with
selectionPlot.
selectionPlot(append(classifyr_result, survForestCV),
characteristicsList = list(x = "Assay Name", row = "Classifier Name"),
orderingList = list("Assay Name" = c("clinical", names(scfeatures_result)))) + tilt
# If we have 5 repeats of 5 folds cross-validation we have 25 feature selections. The Y-axis is the percentage of the 25 cross-validations a feature is selected in. You'll want to see the boxes high to have feature selected stability.
distribution(classifyr_result[[2]], plot = FALSE)
## assay feature proportion
## 6 proportion_raw HR- Ki67+-region4 1.00
## 4 proportion_raw HER2+-region4 0.88
## 3 proportion_raw HER2+-region1 0.64
## 2 proportion_raw Basal CKlow-region4 0.36
## 10 proportion_raw Macrophages Vim+ Slug+-region3 0.32
## 9 proportion_raw Macrophages Vim+ Slug--region2 0.12
## 1 proportion_raw B cells-region3 0.04
## 5 proportion_raw HR- Ki67+-region1 0.04
## 7 proportion_raw HR+ CK7- Ki67+-region3 0.04
## 8 proportion_raw HRlow CKlow-region5 0.04
Using samplesMetricMap compare the per-sample C-index
for Cox and Random Forest models for metafeature
gene_cor_celltype.
library(grid)
heatmap <- samplesMetricMap(list(classifyr_result[[7]], survForestCV[[7]]))
grid.draw(heatmap)
A few samples are predicted better by one model than another.
The per-sample C-index is a metric unique to ClassifyR. Models and
feature selection approaches may be seen in the vignette or listed by
available().
Interactive Q&A:
Q7: Is the highest predictive performance the only way to choose the best model or can other models be better for other reasons?
The L function is a spatial statistic used to assess the spatial distribution of cell types. It assesses the significance of cell-cell interactions, by calculating the density of a cell type with other cell types within a certain radius. High values indicate spatial association, low values indicate spatial avoidance.
# select one patient
one_sample <- data_sce[ , data_sce$metabricId == "MB-0128" ]
one_sample <- data.frame( colData(one_sample) )
one_sample$celltype <- one_sample$description
# select certain cell types to examine the interaction
index <- one_sample$celltype %in% c("B cells", "Fibroblasts")
one_sample$celltype[!index] <- "others"
a <-ggplot( one_sample, aes(x = Location_Center_X , y = Location_Center_Y, colour = celltype )) + geom_point() + scale_colour_manual(values = color_codes) + ggtitle( "Patient MB-0128 - high L value with \n B cells interacting Fibroblasts")
one_sample$celltype <- one_sample$description
index <- one_sample$celltype %in% c("melano", "Tc.ae")
one_sample$celltype[!index] <- "others"
b <- ggplot( one_sample, aes(x = Location_Center_X , y = Location_Center_Y, colour = celltype )) + geom_point() + scale_colour_manual(values = color_codes) + ggtitle( "Patient MB-0128 - low L value with \n B cells interacting HR_ CK7")
ggarrange(plotlist = list(a,b))
We calculate the nearest neighbours of each cell and then calculate the pairs of cell types based on the nearest neighbour. This allows us to summarise it into a cell type interaction composition.
one_sample <- data_sce[ , data_sce$metabricId == "MB-0263" ]
one_sample <- data.frame( colData(one_sample) )
one_sample$celltype <- one_sample$description
a <-ggplot( one_sample, aes(x = Location_Center_X , y = Location_Center_Y, colour = celltype )) + geom_point() + scale_colour_manual(values = color_codes) + ggtitle( "Patient MB-0263")
feature <- scfeatures_result$celltype_interaction
to_plot <- data.frame( t( feature[ "MB-0263_cond_more than 10 years" , ]) )
to_plot$feature <- rownames(to_plot)
colnames(to_plot) <- c("value", "celltype interaction composition")
to_plot <- to_plot[ to_plot$value > 0.03 , ]
b <- ggplot(to_plot, aes(x = reorder(`celltype interaction composition`, value) , y = value, fill=`celltype interaction composition`)) + geom_bar(stat="identity" ) + ylab("Major cell type interactions") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
ggarrange(plotlist = list(a,b))
Moran’s I is a spatial autocorrelation statistic based on both location and values. It quantifies whether similar values tend to occur near each other or are dispersed.
high <- data_sce["Ki67", data_sce$metabricId == "MB-0132" ]
high_meta <- data.frame( colData(high) )
high_meta$expression <- as.vector(logcounts( high))
low <- data_sce["Ki67", data_sce$metabricId == "MB-0249" ]
low_meta <- data.frame( colData(low) )
low_meta$expression <- as.vector(logcounts(low))
a <- ggplot(high_meta, aes(x = Location_Center_X , y = Location_Center_Y, colour =expression)) + geom_point(alpha=0.5) + scale_colour_viridis_c() + ggtitle("Patient MB-0132 - high Moran's I in Ki67")
b <- ggplot(low_meta, aes(x = Location_Center_X , y = Location_Center_Y, colour =expression)) + geom_point(alpha=0.5) + scale_colour_viridis_c() + ggtitle("Patient MB-0249 - low Moran's I in Ki67")
ggarrange(plotlist = list(a,b))
This metric measures the correlation of proteins/genes between a cell and its nearest neighbour cell.
plot_nncorrelation <- function(thissample , thisprotein){
sample_name <- thissample
thissample <- data_sce[, data_sce$metabricId == sample_name]
exprsMat <- logcounts(thissample)
# calculate NN correlation
cell_points_cts <- spatstat.geom::ppp(
x = as.numeric(thissample$Location_Center_X ), y = as.numeric(thissample$Location_Center_Y),
check = FALSE,
xrange = c(
min(as.numeric(thissample$Location_Center_X)),
max(as.numeric(thissample$Location_Center_X))
),
yrange = c(
min(as.numeric(thissample$Location_Center_Y)),
max(as.numeric(thissample$Location_Center_Y))
),
marks = t(as.matrix(exprsMat))
)
value <- spatstat.explore::nncorr(cell_points_cts)["correlation", ]
value <- value[ thisprotein]
# Find the indices of the two nearest neighbors for each cell
nn_indices <- nnwhich(cell_points_cts, k = 1)
protein <- thisprotein
df <- data.frame(thiscell_exprs = exprsMat[protein, ] , exprs = exprsMat[protein,nn_indices ])
p <- ggplot(df, aes( x =thiscell_exprs , y = exprs , colour = exprs )) +
geom_point(alpha = 0.3) + ggtitle(paste0( "Patient ", sample_name , " nn_corr = " , round(value, 2) )) + scale_colour_viridis_c() + xlab("This cell expression") + ylab("Neighbouring cell expression")
return (p )
}
p1 <- plot_nncorrelation("MB-0605", "HER2")
p2 <- plot_nncorrelation("MB-0258", "HER2")
ggarrange(plotlist = list(p1, p2))
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 12 (bookworm)
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.21.so; LAPACK version 3.11.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Australia/Sydney
## tzcode source: system (glibc)
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] reshape_0.8.9 scran_1.28.1
## [3] scater_1.28.0 scuttle_1.10.1
## [5] spatstat_3.0-6 spatstat.linnet_3.1-1
## [7] spatstat.model_3.2-4 rpart_4.1.19
## [9] spatstat.explore_3.2-1 nlme_3.1-162
## [11] spatstat.random_3.1-5 spatstat.geom_3.2-1
## [13] spatstat.data_3.0-1 survminer_0.4.9
## [15] ggpubr_0.6.0 tidyr_1.3.0
## [17] scattermore_0.8 plotly_4.10.2
## [19] limma_3.56.2 dplyr_1.1.2
## [21] spicyR_1.12.0 ggthemes_4.2.4
## [23] lisaClust_1.9.2 ClassifyR_3.5.21
## [25] survival_3.5-5 BiocParallel_1.34.2
## [27] MultiAssayExperiment_1.26.0 generics_0.1.3
## [29] scFeatures_0.99.27 ggplot2_3.4.2
## [31] SingleCellExperiment_1.22.0 SummarizedExperiment_1.30.2
## [33] Biobase_2.60.0 GenomicRanges_1.52.0
## [35] GenomeInfoDb_1.36.1 IRanges_2.34.1
## [37] S4Vectors_0.38.1 BiocGenerics_0.46.0
## [39] MatrixGenerics_1.12.2 matrixStats_1.0.0
##
## loaded via a namespace (and not attached):
## [1] ProtGenerics_1.32.0 GSVA_1.48.2
## [3] spatstat.sparse_3.0-2 bitops_1.0-7
## [5] httr_1.4.6 RColorBrewer_1.1-3
## [7] numDeriv_2016.8-1.1 EnsDb.Hsapiens.v79_2.99.0
## [9] tools_4.3.1 backports_1.4.1
## [11] DT_0.28 utf8_1.2.3
## [13] R6_2.5.1 HDF5Array_1.28.1
## [15] mgcv_1.8-42 lazyeval_0.2.2
## [17] rhdf5filters_1.12.1 withr_2.5.0
## [19] gridExtra_2.3 prettyunits_1.1.1
## [21] cli_3.6.1 labeling_0.4.2
## [23] sass_0.4.6 survMisc_0.5.6
## [25] SingleCellSignalR_1.12.0 Rsamtools_2.16.0
## [27] ggupset_0.3.0 R.utils_2.12.2
## [29] rstudioapi_0.14 RSQLite_2.3.1
## [31] shape_1.4.6 BiocIO_1.10.0
## [33] crosstalk_1.2.0 gtools_3.9.4
## [35] car_3.1-2 scam_1.2-14
## [37] Matrix_1.5-4.1 ggbeeswarm_0.7.2
## [39] fansi_1.0.4 abind_1.4-5
## [41] R.methodsS3_1.8.2 lifecycle_1.0.3
## [43] yaml_2.3.7 edgeR_3.42.4
## [45] carData_3.0-5 gplots_3.1.3
## [47] rhdf5_2.44.0 BiocFileCache_2.8.0
## [49] Rtsne_0.16 blob_1.2.4
## [51] promises_1.2.0.1 dqrng_0.3.0
## [53] crayon_1.5.2 lattice_0.21-8
## [55] cowplot_1.1.1 beachmat_2.16.0
## [57] msigdbr_7.5.1 GenomicFeatures_1.52.1
## [59] annotate_1.78.0 KEGGREST_1.40.0
## [61] magick_2.7.4 pillar_1.9.0
## [63] knitr_1.43 metapod_1.8.0
## [65] boot_1.3-28.1 rjson_0.2.21
## [67] codetools_0.2-19 glue_1.6.2
## [69] data.table_1.14.8 vctrs_0.6.1
## [71] png_0.1-8 gtable_0.3.4
## [73] cachem_1.0.8 xfun_0.39
## [75] S4Arrays_1.0.6 mime_0.12
## [77] pheatmap_1.0.12 iterators_1.0.14
## [79] KMsurv_0.1-5 statmod_1.5.0
## [81] bluster_1.10.0 ellipsis_0.3.2
## [83] bit64_4.0.5 progress_1.2.2
## [85] filelock_1.0.2 bslib_0.5.0
## [87] irlba_2.3.5.1 vipor_0.4.5
## [89] KernSmooth_2.23-21 colorspace_2.1-0
## [91] DBI_1.1.3 tidyselect_1.2.0
## [93] proxyC_0.3.3 bit_4.0.5
## [95] compiler_4.3.1 curl_5.0.2
## [97] AUCell_1.22.0 graph_1.78.0
## [99] BiocNeighbors_1.18.0 xml2_1.3.4
## [101] DelayedArray_0.26.6 rtracklayer_1.60.0
## [103] scales_1.2.1 caTools_1.18.2
## [105] rappdirs_0.3.3 stringr_1.5.0
## [107] SpatialExperiment_1.11.2 digest_0.6.33
## [109] goftest_1.2-3 minqa_1.2.5
## [111] spatstat.utils_3.0-3 rmarkdown_2.23
## [113] XVector_0.40.0 htmltools_0.5.5
## [115] pkgconfig_2.0.3 lme4_1.1-34
## [117] sparseMatrixStats_1.12.2 highr_0.10
## [119] dbplyr_2.3.2 fastmap_1.1.1
## [121] ensembldb_2.24.0 htmlwidgets_1.6.2
## [123] rlang_1.1.1 GlobalOptions_0.1.2
## [125] shiny_1.7.4 DelayedMatrixStats_1.22.1
## [127] farver_2.1.1 jquerylib_0.1.4
## [129] zoo_1.8-12 jsonlite_1.8.7
## [131] R.oo_1.25.0 BiocSingular_1.16.0
## [133] RCurl_1.98-1.12 magrittr_2.0.3
## [135] GenomeInfoDbData_1.2.10 Rhdf5lib_1.22.0
## [137] munsell_0.5.0 Rcpp_1.0.10
## [139] viridis_0.6.3 ape_5.7-1
## [141] babelgene_22.9 stringi_1.7.12
## [143] zlibbioc_1.46.0 MASS_7.3-59
## [145] plyr_1.8.8 ggrepel_0.9.3
## [147] parallel_4.3.1 deldir_1.0-9
## [149] Biostrings_2.68.1 splines_4.3.1
## [151] tensor_1.5 multtest_2.56.0
## [153] hms_1.1.3 circlize_0.4.15
## [155] locfit_1.5-9.8 igraph_1.5.0
## [157] ggsignif_0.6.4 reshape2_1.4.4
## [159] biomaRt_2.56.1 ScaledMatrix_1.8.1
## [161] XML_3.99-0.14 evaluate_0.21
## [163] RcppParallel_5.1.7 BiocManager_1.30.22
## [165] nloptr_2.0.3 tweenr_2.0.2
## [167] foreach_1.5.2 httpuv_1.6.11
## [169] EnsDb.Mmusculus.v79_2.99.0 purrr_1.0.1
## [171] polyclip_1.10-4 km.ci_0.5-6
## [173] ggforce_0.4.1 rsvd_1.0.5
## [175] broom_1.0.5 xtable_1.8-4
## [177] restfulr_0.0.15 AnnotationFilter_1.24.0
## [179] rstatix_0.7.2 later_1.3.1
## [181] viridisLite_0.4.2 class_7.3-22
## [183] tibble_3.2.1 lmerTest_3.1-3
## [185] beeswarm_0.4.0 memoise_2.0.1
## [187] AnnotationDbi_1.62.2 GenomicAlignments_1.36.0
## [189] cluster_2.1.4 concaveman_1.1.0
## [191] GSEABase_1.62.0
The authors thank all their colleagues, particularly at The University of Sydney, Sydney Precision Data Science and Charles Perkins Centre for their support and intellectual engagement. Special thanks to Ellis Patrick, Shila Ghazanfar, Andy Tran for guiding and supporting the building of this workshop.